Inspect Data

suppressPackageStartupMessages(library(tidyverse))
library(targets)
library(tarchetypes)
library(DT)
# Evaluate all chunks with the working directory set two levels up from the
# default (presumably the project root that holds the targets store -- confirm).
knitr::opts_knit$set(root.dir = "../../")

Modelling Matrix

Original

# Read the `df_mm` target (the original modelling matrix) from the pipeline.
df_mm <- targets::tar_read(df_mm)

# Print it with category_id, activity_id_new and has_finding moved to the
# front; all remaining columns follow in their existing order.
dplyr::select(
  df_mm,
  category_id, activity_id_new, has_finding,
  dplyr::everything()
)
## # A tibble: 4,055 × 185
##    category_id activity_id_new has_finding start_date n_visit n_unsch_visit
##    <chr>       <chr>           <chr>       <date>       <dbl>         <dbl>
##  1 cnsn        00001           yes         2015-01-01     732            26
##  2 cnsn        00003           yes         2014-01-01      NA            NA
##  3 cnsn        00004           yes         2015-01-01     181            33
##  4 cnsn        00006           yes         2014-01-01       0             0
##  5 cnsn        00008           yes         2015-01-01     114             0
##  6 cnsn        00009           yes         2015-01-01     297             0
##  7 cnsn        00012           yes         2015-01-01     446             7
##  8 cnsn        00013           yes         2015-01-01     688            65
##  9 cnsn        00015           yes         2015-01-01     141            17
## 10 cnsn        00016           yes         2015-01-01       0             0
## # … with 4,045 more rows, and 179 more variables: n_sched_visit <dbl>,
## #   ratio_unsch_visit <dbl>, ratio_unsch_visit_rnk <dbl>, n_ae <dbl>,
## #   n_sae <dbl>, ae_per_visit <dbl>, sae_per_visit <dbl>,
## #   ae_per_visit_rnk <dbl>, sae_per_visit_rnk <dbl>,
## #   median_ae_reporting_delay <dbl>, mean_ae_reporting_delay <dbl>,
## #   max_ae_reporting_delay <dbl>, median_sae_reporting_delay <dbl>,
## #   mean_sae_reporting_delay <dbl>, max_sae_reporting_delay <dbl>,
## #   n_patients <dbl>, therapeutic_area <chr>, n_active_sites_pi_yy <dbl>,
## #   n_active_sites_pi_yy_rnk <dbl>, n_active_trials_at_site_in_ta_yy <dbl>,
## #   n_active_trials_at_site_in_ta_yy_rnk <dbl>, time_on_study_dd <dbl>,
## #   dev_data_available <chr>, n_maj_dev <dbl>, n_min_dev <dbl>,
## #   n_maj_dev_per_daysonstudy <dbl>, n_min_dev_per_daysonstudy <dbl>,
## #   n_maj_dev_per_daysonstudy_rnk <dbl>, n_min_dev_per_daysonstudy_rnk <dbl>,
## #   issue_data_available <chr>, mean_iss_completion_time <dbl>,
## #   median_iss_completion_time <dbl>, max_iss_completion_time <dbl>,
## #   n_iss_open <dbl>, n_iss_open_per_pat <dbl>, n_iss_due <dbl>,
## #   n_iss_compl <dbl>, n_iss_compl_per_daysonstudy <dbl>, n_iss_late <dbl>,
## #   n_iss_cnsn_open <dbl>, n_iss_cnsn_due <dbl>, n_iss_cnsn_compl <dbl>,
## #   n_iss_cnsn_late <dbl>, n_iss_dtin_open <dbl>, n_iss_dtin_due <dbl>,
## #   n_iss_dtin_compl <dbl>, n_iss_dtin_late <dbl>, n_iss_ptpe_open <dbl>,
## #   n_iss_ptpe_due <dbl>, n_iss_ptpe_compl <dbl>, n_iss_ptpe_late <dbl>,
## #   n_iss_srpo_open <dbl>, n_iss_srpo_due <dbl>, n_iss_srpo_compl <dbl>,
## #   n_iss_srpo_late <dbl>, n_iss_spno_open <dbl>, n_iss_spno_due <dbl>,
## #   n_iss_spno_compl <dbl>, n_iss_spno_late <dbl>, n_iss_sfty_open <dbl>,
## #   n_iss_sfty_due <dbl>, n_iss_sfty_compl <dbl>, n_iss_sfty_late <dbl>,
## #   n_iss_stdc_open <dbl>, n_iss_stdc_due <dbl>, n_iss_stdc_compl <dbl>,
## #   n_iss_stdc_late <dbl>, screen_failure_ratio <dbl>, protocol_version <dbl>,
## #   countrycode <chr>, region <chr>, subregion <chr>, is_engl_prim_lang <dbl>,
## #   subregion_alt <chr>, is_apac <dbl>, is_eu_east <dbl>, is_eu <dbl>,
## #   is_us_nz_ca_au <dbl>, is_sa <dbl>, is_afr <dbl>, prob_low_prob_ur <dbl>,
## #   is_prob_ur_smp05 <dbl>, is_prob_ur_smp1 <dbl>, is_prob_ur_p5_p95 <dbl>,
## #   is_prob_ur_grp95 <dbl>, is_prob_ur_grp75 <dbl>, studyphase <chr>,
## #   is_pediatric <dbl>, blinding <chr>, comparison <chr>,
## #   therapeuticarea <chr>, targetnumcountries <dbl>, randomization <chr>,
## #   generalindication <chr>, is_dementia <dbl>, is_ad <dbl>, is_cancer <dbl>,
## #   is_neuro_or_psychiatric <dbl>, is_autoimmune <dbl>,
## #   distinct_unsch_visits <dbl>, …
# Interactive, searchable listing of every column name in `df_mm`.
DT::datatable(tibble::tibble(columns = colnames(df_mm)))

Binned

# Read the `df_mm_bin` target (the binned modelling matrix) from the pipeline.
df_mm_bin <- targets::tar_read(df_mm_bin)

# Print it with category_id, activity_id_new and has_finding moved to the
# front; all remaining columns follow in their existing order.
dplyr::select(
  df_mm_bin,
  category_id, activity_id_new, has_finding,
  dplyr::everything()
)
## # A tibble: 4,055 × 861
##    category_id activity_id_new has_finding nvisitLL nvisitML nvisitM nvisitMH
##    <chr>       <chr>           <chr>          <dbl>    <dbl>   <dbl>    <dbl>
##  1 cnsn        00001           yes                0        0       0        1
##  2 cnsn        00003           yes                0        0       0        0
##  3 cnsn        00004           yes                0        0       1        0
##  4 cnsn        00006           yes                1        0       0        0
##  5 cnsn        00008           yes                0        1       0        0
##  6 cnsn        00009           yes                0        0       1        0
##  7 cnsn        00012           yes                0        0       0        1
##  8 cnsn        00013           yes                0        0       0        1
##  9 cnsn        00015           yes                0        1       0        0
## 10 cnsn        00016           yes                1        0       0        0
## # … with 4,045 more rows, and 854 more variables: nvisitHH <dbl>,
## #   nvisitNA <dbl>, nunschvisitLL <dbl>, nunschvisitML <dbl>,
## #   nunschvisitM <dbl>, nunschvisitMH <dbl>, nunschvisitHH <dbl>,
## #   nunschvisitNA <dbl>, nschedvisitLL <dbl>, nschedvisitML <dbl>,
## #   nschedvisitM <dbl>, nschedvisitMH <dbl>, nschedvisitHH <dbl>,
## #   nschedvisitNA <dbl>, ratiounschvisitLL <dbl>, ratiounschvisitML <dbl>,
## #   ratiounschvisitM <dbl>, ratiounschvisitMH <dbl>, ratiounschvisitHH <dbl>,
## #   ratiounschvisitNA <dbl>, ratiounschvisitrnkLL <dbl>,
## #   ratiounschvisitrnkML <dbl>, ratiounschvisitrnkM <dbl>,
## #   ratiounschvisitrnkMH <dbl>, ratiounschvisitrnkHH <dbl>,
## #   ratiounschvisitrnkNA <dbl>, naeLL <dbl>, naeML <dbl>, naeM <dbl>,
## #   naeMH <dbl>, naeHH <dbl>, naeNA <dbl>, nsaeLL <dbl>, nsaeML <dbl>,
## #   nsaeM <dbl>, nsaeMH <dbl>, nsaeHH <dbl>, nsaeNA <dbl>, aepervisitLL <dbl>,
## #   aepervisitML <dbl>, aepervisitM <dbl>, aepervisitMH <dbl>,
## #   aepervisitHH <dbl>, aepervisitNA <dbl>, saepervisitLL <dbl>,
## #   saepervisitML <dbl>, saepervisitM <dbl>, saepervisitMH <dbl>,
## #   saepervisitHH <dbl>, saepervisitNA <dbl>, aepervisitrnkLL <dbl>,
## #   aepervisitrnkML <dbl>, aepervisitrnkM <dbl>, aepervisitrnkMH <dbl>,
## #   aepervisitrnkHH <dbl>, aepervisitrnkNA <dbl>, saepervisitrnkLL <dbl>,
## #   saepervisitrnkML <dbl>, saepervisitrnkM <dbl>, saepervisitrnkMH <dbl>,
## #   saepervisitrnkHH <dbl>, saepervisitrnkNA <dbl>,
## #   medianaereportingdelayLL <dbl>, medianaereportingdelayML <dbl>,
## #   medianaereportingdelayM <dbl>, medianaereportingdelayMH <dbl>,
## #   medianaereportingdelayHH <dbl>, medianaereportingdelayNA <dbl>,
## #   meanaereportingdelayLL <dbl>, meanaereportingdelayML <dbl>,
## #   meanaereportingdelayM <dbl>, meanaereportingdelayMH <dbl>,
## #   meanaereportingdelayHH <dbl>, meanaereportingdelayNA <dbl>,
## #   maxaereportingdelayLL <dbl>, maxaereportingdelayML <dbl>,
## #   maxaereportingdelayM <dbl>, maxaereportingdelayMH <dbl>,
## #   maxaereportingdelayHH <dbl>, maxaereportingdelayNA <dbl>,
## #   mediansaereportingdelayLL <dbl>, mediansaereportingdelayML <dbl>,
## #   mediansaereportingdelayM <dbl>, mediansaereportingdelayMH <dbl>,
## #   mediansaereportingdelayHH <dbl>, mediansaereportingdelayNA <dbl>,
## #   meansaereportingdelayLL <dbl>, meansaereportingdelayML <dbl>,
## #   meansaereportingdelayM <dbl>, meansaereportingdelayMH <dbl>,
## #   meansaereportingdelayHH <dbl>, meansaereportingdelayNA <dbl>,
## #   maxsaereportingdelayLL <dbl>, maxsaereportingdelayML <dbl>,
## #   maxsaereportingdelayM <dbl>, maxsaereportingdelayMH <dbl>,
## #   maxsaereportingdelayHH <dbl>, maxsaereportingdelayNA <dbl>,
## #   npatientsLL <dbl>, npatientsML <dbl>, …
# Interactive, searchable listing of every column name in `df_mm_bin`.
DT::datatable(tibble::tibble(columns = colnames(df_mm_bin)))

Coefficients

Modelling coefficients have been preselected.

# Render the `df_form` target as an interactive table.
DT::datatable(targets::tar_read(df_form))

Cross Validation Indices

Indices of the modelling matrix that define the time-series cross-validation strategy.

# Print the `df_cv` target read from the pipeline.
targets::tar_read(df_cv)
## # A tibble: 45 × 4
##    year_start_act category_id index_past              index_next_year           
##             <dbl> <chr>       <chr>                   <chr>                     
##  1           2011 cnsn        70,71,72,84,85,86,87,8… 155,170,171,172,173,174,1…
##  2           2012 cnsn        70,71,72,84,85,86,87,8… 211,229,231,234,241,242,2…
##  3           2013 cnsn        70,71,72,84,85,86,87,8… 2,4,306,307,308,309,310,3…
##  4           2014 cnsn        2,4,70,71,72,84,85,86,… 1,3,5,6,7,8,9,10,11,12,13…
##  5           2015 cnsn        1,2,3,4,5,6,7,8,9,10,1… 30,34,45,46,47,48,49,50,5…
##  6           2016 cnsn        1,2,3,4,5,6,7,8,9,10,1… 98,99,100,101,102,103,106…
##  7           2017 cnsn        1,2,3,4,5,6,7,8,9,10,1… 228,238,239,240,248,251,2…
##  8           2018 cnsn        1,2,3,4,5,6,7,8,9,10,1… 358,359,360,361,362,363,3…
##  9           2019 cnsn        1,2,3,4,5,6,7,8,9,10,1… 368,805,808               
## 10           2015 dtin        5678,5679,5680,5681,56… 5698,5714,5715,5724,5725,…
## # … with 35 more rows

Lookup Tables

Features

Names of all features and their variations.

# Render the `df_feat_lookup` target as an interactive table.
DT::datatable(targets::tar_read(df_feat_lookup))

Categories

All finding statements mapped to clinical impact factors.

# Render the `df_cat_lookup` target as an interactive table.
DT::datatable(targets::tar_read(df_cat_lookup))